{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 02 淘米洗菜-数据预处理\n",
    "从菜市场买回来的菜，总有一些不好的，买回来后要先做一遍预处理，把那些不好的部分扔掉。现实中的数据也是如此。\n",
    "\n",
    "常见不规整的数据有**缺失数据**、**重复数据**、**异常数据**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "# 一个cell输出多行语句\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 一、缺失值处理\n",
    "处理方式：**删除**、**填充**\n",
    "### 1.1 缺失值查看\n",
    "isnull()方法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>54.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>16.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>47.0</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>41.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号    年龄   性别       注册时间\n",
       "0  A1  54.0    男 2018-08-08\n",
       "1  A2  16.0  NaN 2018-08-09\n",
       "2  A3  47.0    女 2018-08-10\n",
       "3  A4   NaN  NaN        NaT\n",
       "4  A5  41.0    男 2018-08-11"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 4 columns):\n",
      "编号      5 non-null object\n",
      "年龄      4 non-null float64\n",
      "性别      3 non-null object\n",
      "注册时间    4 non-null datetime64[ns]\n",
      "dtypes: datetime64[ns](1), float64(1), object(2)\n",
      "memory usage: 288.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_excel('./data/fillna.xlsx')\n",
    "df\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      编号     年龄     性别   注册时间\n",
       "0  False  False  False  False\n",
       "1  False  False   True  False\n",
       "2  False  False  False  False\n",
       "3  False   True   True   True\n",
       "4  False  False  False  False"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.2 缺失值删除\n",
    "缺失值分为两种：一行中某个字段为缺失值、一行中的字段全部为缺失值\n",
    "\n",
    "dropna()方法默认删除含有缺失值的行，即只要某一行有缺失值，就删除\n",
    "\n",
    "Excel实现：通过快捷键**Ctrl+G**弹出定位条件，选择空值找到。这样所有缺失的部分全部选中，然后鼠标右键删除（整行、整列、单元格）即可"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>54.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>47.0</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>41.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号    年龄 性别       注册时间\n",
       "0  A1  54.0  男 2018-08-08\n",
       "2  A3  47.0  女 2018-08-10\n",
       "4  A5  41.0  男 2018-08-11"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>54.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>16.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>47.0</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>41.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号    年龄   性别       注册时间\n",
       "0  A1  54.0    男 2018-08-08\n",
       "1  A2  16.0  NaN 2018-08-09\n",
       "2  A3  47.0    女 2018-08-10\n",
       "3  A4   NaN  NaN        NaT\n",
       "4  A5  41.0    男 2018-08-11"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(how='all')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.3 缺失值的填充\n",
    "数据是宝贵的，一般情况，数据缺失比例不是过高（不大于30%），尽量不删除，选择填充。\n",
    "\n",
    "Excel实现：**Ctrl+G**弹出定位条件找到空值，填充0，然后按**Ctrl+Enter**组合键，所有的值将会填充\n",
    "\n",
    "除了0填充，还有平均值、众数、向前填充（用缺失值的前一个非缺失值填充）、向后填充等\n",
    "\n",
    "**fillna()方法**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>54.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>16.0</td>\n",
       "      <td>0</td>\n",
       "      <td>2018-08-09 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>47.0</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>41.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11 00:00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号    年龄 性别                 注册时间\n",
       "0  A1  54.0  男  2018-08-08 00:00:00\n",
       "1  A2  16.0  0  2018-08-09 00:00:00\n",
       "2  A3  47.0  女  2018-08-10 00:00:00\n",
       "3  A4   0.0  0                    0\n",
       "4  A5  41.0  男  2018-08-11 00:00:00"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>54.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>16.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>47.0</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>男</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>41.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号    年龄 性别       注册时间\n",
       "0  A1  54.0  男 2018-08-08\n",
       "1  A2  16.0  男 2018-08-09\n",
       "2  A3  47.0  女 2018-08-10\n",
       "3  A4   NaN  男        NaT\n",
       "4  A5  41.0  男 2018-08-11"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 指定列名填充\n",
    "df.fillna({'性别':'男'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A4</td>\n",
       "      <td>30</td>\n",
       "      <td>男</td>\n",
       "      <td>NaT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男 2018-08-08\n",
       "1  A2  16  男 2018-08-09\n",
       "2  A3  47  女 2018-08-10\n",
       "3  A4  30  男        NaT\n",
       "4  A5  41  男 2018-08-11"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 同时指定多列填充\n",
    "df.fillna({'性别':'男', '年龄':'30'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二、重复值处理\n",
    "重复数据一般做删除处理\n",
    "\n",
    "Excel实现：数据-》数据工具-》删除重复值\n",
    "\n",
    "drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1  张 通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09\n",
       "2   A3   孙凤    103 2018-08-10\n",
       "3   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11\n",
       "5   A5   赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1  张 通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09\n",
       "2   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11\n",
       "5   A5   赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel('./data/delete_repeat.xlsx')\n",
    "df\n",
    "# 默认对所有字段进行重复值判断\n",
    "df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1  张 通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09\n",
       "2   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 指定列名，只针对某一列或某几列进行重复值删除判断\n",
    "df.drop_duplicates(subset='唯一识别码')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1  张 通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09\n",
       "2   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 指定多列去重\n",
    "df.drop_duplicates(subset=['客户姓名','唯一识别码'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1  张 通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09\n",
       "3   A3   孙凤    103 2018-08-10\n",
       "5   A5   赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1  张 通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 自定义删除重复项时保留哪个，默认保留第一个，也可以设置保留最后一个，或者全部不保留\n",
    "# keep参数\n",
    "# first：保留第一个\n",
    "# last：保留最后一个值\n",
    "# False：重复值全部删除\n",
    "df.drop_duplicates(subset=['客户姓名','唯一识别码'], keep='last')\n",
    "df.drop_duplicates(subset=['客户姓名','唯一识别码'], keep=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 三、异常值的检测与处理\n",
    "异常值：相比正常数据过低或过高的数据，比如：一个人的年龄是0岁或300岁\n",
    "### 3.1 异常值检测\n",
    "发现异常值方式：\n",
    "* 根据业务经验划定不同指标的正常范围，超出该范围算异常值\n",
    "* 绘制箱型图，大于（小于）箱型图上（下）边缘的点算异常值\n",
    "* 如果数据服从正太分布，则可以利用3σ（西格玛）原则：如果一个数值与平均值之间的偏差超过3倍标准差，那么这个值就是异常值\n",
    "\n",
    "<center>\n",
    "    <img src='./image/xxt.jpg' width='30%'>\n",
    "    <p style='text-align:center;font-size:14px'>箱型图</p>\n",
    "</center>\n",
    "\n",
    "<center>\n",
    "    <img src='./image/ztfbt.jpg' width='50%'>\n",
    "    <p style='text-align:center;font-size:14px'>状态分布图</p>\n",
    "</center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 异常值处理\n",
    "几种处理方式：\n",
    "* 最常用的：删除\n",
    "* 将异常值当缺失值来填充（Excel实现方式：替换；Python：replace()）\n",
    "* 把异常值当特殊情况，研究异常值出现的原因"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 四、数据类型转换\n",
    "### 4.1 Python6种数据类型\n",
    "\n",
    "<img src='./image/6type.jpg' width='80%' style='float:left'>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**dtype方法**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dtype('O')"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "dtype('int64')"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['订单编号'].dtype\n",
    "df['唯一识别码'].dtype"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.2 类型转换\n",
    "不同的数据类型可以做的事情是不一样的\n",
    "\n",
    "**astype方法**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    101.0\n",
       "1    102.0\n",
       "2    103.0\n",
       "3    103.0\n",
       "4    104.0\n",
       "5    104.0\n",
       "Name: 唯一识别码, dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['唯一识别码'].astype('float64')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 五、索引设置\n",
    "索引是查找数据的依据，设置索引的目的是便于我们查找数据。例如：从超市买菜回来，放在冰箱里，放的过程就是建立索引的过程，如蔬菜放在冷藏室，肉类放在冷冻室。\n",
    "### 5.1 为无索引列表添加索引\n",
    "Python中，如果表没有索引，会默认从0开始的自然数做索引。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Unnamed: 1</th>\n",
       "      <th>Unnamed: 2</th>\n",
       "      <th>Unnamed: 3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Unnamed: 0 Unnamed: 1  Unnamed: 2 Unnamed: 3\n",
       "0         A1        张 通         101 2018-08-08\n",
       "1         A2         李谷         102 2018-08-09\n",
       "2         A3         孙凤         103 2018-08-10\n",
       "3         A4         赵恒         104 2018-08-11\n",
       "4         A5         赵恒         104 2018-08-12"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel('./data/no_index.xlsx')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1  张 通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09\n",
       "2   A3   孙凤    103 2018-08-10\n",
       "3   A4   赵恒    104 2018-08-11\n",
       "4   A5   赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "1   A1  张 通    101 2018-08-08\n",
       "2   A2   李谷    102 2018-08-09\n",
       "3   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11\n",
       "5   A5   赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df的columns传入列索引值，index传入行索引值。以达到为无索引列表添加索引的目的。\n",
    "df.columns = ['订单编号', '客户姓名', '唯一识别码', '成交时间']\n",
    "df\n",
    "df.index = [1, 2, 3, 4, 5]\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.2 重新设置索引\n",
    "一般重新设置行索引\n",
    "\n",
    "Excel实现：将要做行索引的列拖至第一列的位置即可"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>订单编号</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     客户姓名  唯一识别码       成交时间\n",
       "订单编号                       \n",
       "A1    张 通    101 2018-08-08\n",
       "A2     李谷    102 2018-08-09\n",
       "A3     孙凤    103 2018-08-10\n",
       "A4     赵恒    104 2018-08-11\n",
       "A5     赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.set_index('订单编号')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**层次化索引**\n",
    "\n",
    "**set_index()**方法传入两个或多个列名。层次化索引一般用在某一列含有多个重复值的情况.\n",
    "\n",
    "例子如下：其中a、b、c、d分别有多个重复\n",
    "\n",
    "<img src='./image/set_index.jpg' width='20%' style='float:left'>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.3 重命名索引\n",
    "针对现有索引名进行修改，即改字段名\n",
    "\n",
    "**rename()**方法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>新订单编号</th>\n",
       "      <th>新客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  新订单编号 新客户姓名  唯一识别码       成交时间\n",
       "1    A1   张 通    101 2018-08-08\n",
       "2    A2    李谷    102 2018-08-09\n",
       "3    A3    孙凤    103 2018-08-10\n",
       "4    A4    赵恒    104 2018-08-11\n",
       "5    A5    赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "1   A1  张 通    101 2018-08-08\n",
       "2   A2   李谷    102 2018-08-09\n",
       "3   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11\n",
       "5   A5   赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>一</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>二</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>三</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "一   A1  张 通    101 2018-08-08\n",
       "二   A2   李谷    102 2018-08-09\n",
       "三   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11\n",
       "5   A5   赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>新订单编号</th>\n",
       "      <th>新客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>一</td>\n",
       "      <td>A1</td>\n",
       "      <td>张 通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>二</td>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>三</td>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  新订单编号 新客户姓名  唯一识别码       成交时间\n",
       "一    A1   张 通    101 2018-08-08\n",
       "二    A2    李谷    102 2018-08-09\n",
       "三    A3    孙凤    103 2018-08-10\n",
       "4    A4    赵恒    104 2018-08-11\n",
       "5    A5    赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rename(columns={'订单编号':'新订单编号', '客户姓名':'新客户姓名'})\n",
    "df\n",
    "df.rename(index={1:'一', 2:'二', 3:'三'})\n",
    "df.rename(columns={'订单编号':'新订单编号', '客户姓名':'新客户姓名'}, index={1:'一', 2:'二', 3:'三'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.4 重置索引\n",
    "重置索引主要用在层次化索引中，重置索引是将索引列当作一个**columns**进行返回\n",
    "\n",
    "**reset_index(level=None, drop=False, inplace=False)**方法，用于数组分组、数据透视中\n",
    "\n",
    "* level：指定要将层次化索引的第几级别转化为columns，第一个索引为0级，第二个索引为1级，默认为全部索引，即默认将索引全部转换为columns\n",
    "* drop：指定是否将原索引删掉，即不作为一个新的索引，默认为False，即不删掉原索引\n",
    "* inplace：指定是否修改原数据表\n",
    "\n",
    "<center>\n",
    "    <img src='./image/reset_index.jpg' width='80%'>\n",
    "</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 1],\n",
       "       [2, 3],\n",
       "       [4, 5],\n",
       "       [6, 7]])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>C1</th>\n",
       "      <th>C2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Z1</th>\n",
       "      <th>Z2</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td rowspan=\"2\" valign=\"top\">A</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td rowspan=\"2\" valign=\"top\">B</td>\n",
       "      <td>a</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      C1 C2\n",
       "Z1 Z2      \n",
       "A  a   1  2\n",
       "   b   3  4\n",
       "B  a   5  6\n",
       "   b   7  8"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>Z1</th>\n",
       "      <th>Z2</th>\n",
       "      <th>C1</th>\n",
       "      <th>C2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A</td>\n",
       "      <td>b</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>B</td>\n",
       "      <td>a</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>B</td>\n",
       "      <td>b</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Z1 Z2 C1 C2\n",
       "0  A  a  1  2\n",
       "1  A  b  3  4\n",
       "2  B  a  5  6\n",
       "3  B  b  7  8"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>Z1</th>\n",
       "      <th>C1</th>\n",
       "      <th>C2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Z2</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "      <td>A</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>A</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "      <td>B</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>B</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Z1 C1 C2\n",
       "Z2         \n",
       "a   A  1  2\n",
       "b   A  3  4\n",
       "a   B  5  6\n",
       "b   B  7  8"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>Z2</th>\n",
       "      <th>C1</th>\n",
       "      <th>C2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Z1</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>A</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>A</td>\n",
       "      <td>b</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>B</td>\n",
       "      <td>a</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>B</td>\n",
       "      <td>b</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Z2 C1 C2\n",
       "Z1         \n",
       "A   a  1  2\n",
       "A   b  3  4\n",
       "B   a  5  6\n",
       "B   b  7  8"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>C1</th>\n",
       "      <th>C2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  C1 C2\n",
       "0  1  2\n",
       "1  3  4\n",
       "2  5  6\n",
       "3  7  8"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df = pd.read_excel('./data/reset_index.xlsx')\n",
    "# arange: Python内置函数range数组版\n",
    "# 0~7分布到4行2列，+1所有的值都加1\n",
    "np.arange(8).reshape(4, 2)\n",
    "df = pd.DataFrame(np.arange(8).reshape(4, 2) + 1, index=[['A', 'A', 'B', 'B'], ['a', 'b', 'a', 'b']],\n",
    "                  columns=[['C1', 'C2']])\n",
    "# 设置索引名\n",
    "df.index.names = ['Z1', 'Z2']\n",
    "df\n",
    "df.reset_index()\n",
    "df.reset_index(level = 0)\n",
    "df.reset_index(level = 1)\n",
    "# 原索引删除\n",
    "df.reset_index(drop = True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
